library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)

SNAP enrollment data

The SNAP enrollment data was acquired from the American Community Survey 5-year estimates of 2022 from the Census Bureau. The original dataset contained estimates for SNAP/Food stamp enrollment at the census tract level as well as other households attributes at the census tract level: estimate and percent for distribution of race and ethnicity, poverty status, disability status, presentation of children under age of 18, etc.

We selected the variables that were relevant to our analysis to build the final dataset, the final variables included were the following:

SNAP = 
  read_csv(file = "./Datasets/ACS_SNAP_ENROLLMENT.csv")|>
  janitor::clean_names()|>
  select(
    geo_id:s2201_c01_001e,
    s2201_c02_009e,
    s2201_c02_015e,
    s2201_c02_021e:s2201_c02_028e,
    s2201_c02_032e,
    s2201_c02_036e:s2201_c02_038e,
    s2201_c04_001e)|>
  select(-ends_with("m"))|>
  slice(-1)|>
  mutate(geoid = str_remove(geo_id, ".*US"))|>
  rename(
     total_ct_households = s2201_c01_001e,
     ph_with_children = s2201_c02_009e,
     ph_no_children = s2201_c02_015e,
     ph_below_poverty = s2201_c02_021e,
     ph_above_poverty = s2201_c02_022e,
     ph_disability = s2201_c02_023e,
     ph_no_disability = s2201_c02_024e,
     ph_white = s2201_c02_025e,
     ph_black = s2201_c02_026e,
     ph_aian = s2201_c02_027e,
     ph_asian = s2201_c02_028e,
     ph_hispanic = s2201_c02_032e,
     ph_no_work = s2201_c02_036e,
     ph_1_work = s2201_c02_037e,
     ph_2_work = s2201_c02_038e,
     ph_snap = s2201_c04_001e
  )|>
  mutate(across(total_ct_households:ph_snap, as.numeric),
         ph_other_race = 100-ph_white-ph_black-ph_asian,
         ph_non_hispanic = 100-ph_hispanic)|>
  select(geoid,name,ph_snap,everything())|>
  select(-geo_id)
## Rows: 2330 Columns: 458
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (458): "GEO_ID", NAME, S2201_C01_001E, S2201_C01_001M, S2201_C01_002E, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: There were 16 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(total_ct_households:ph_snap, as.numeric)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 15 remaining warnings.
head(SNAP)|>
  knitr::kable()|>
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), font_size = 12) %>% 
  kableExtra::scroll_box(width = "100%", height = "300px")
geoid name ph_snap total_ct_households ph_with_children ph_no_children ph_below_poverty ph_above_poverty ph_disability ph_no_disability ph_white ph_black ph_aian ph_asian ph_hispanic ph_no_work ph_1_work ph_2_work ph_other_race ph_non_hispanic
36061000100 Census Tract 1; New York County; New York NA 0 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
36061000201 Census Tract 2.01; New York County; New York 47.6 878 37.8 62.2 35.0 65.0 22.2 77.8 28.4 6.3 0.0 19.8 57.5 43.2 18.8 38.0 45.5 42.5
36061000202 Census Tract 2.02; New York County; New York 42.5 3293 17.6 82.4 34.5 65.5 25.1 74.9 39.9 14.2 0.5 15.5 40.4 23.3 35.0 41.7 30.4 59.6
36061000500 Census Tract 5; New York County; New York NA 0 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
36061000600 Census Tract 6; New York County; New York 54.8 5191 13.1 86.9 42.3 57.7 24.1 75.9 15.3 7.1 0.3 53.1 25.3 37.1 26.9 36.0 24.5 74.7
36061000700 Census Tract 7; New York County; New York 1.0 4873 11.8 88.2 4.1 95.9 2.2 97.8 75.5 4.5 0.0 15.8 4.0 0.0 35.1 64.9 4.2 96.0

Historic Redlining Score Data

The historic redlining score was obtained from the calculation by the National Community Reinvestment Coalition(NCRC) based on the

redlining = 
  read_excel("./Datasets/Historic Redlining Score 2020B.xlsx")|>
  janitor::clean_names()|>
  mutate(fip = substr(geoid20, 1, 5))|>
  filter(fip %in% c("36005", "36047", "36061", "36081", "36085"))|>
  rename(geoid = geoid20)

matching the historic redlining score with snap enrollment data

redlining_snap = 
  redlining|>
  left_join(SNAP, by = "geoid")|>
  filter(is.na(ph_snap)==FALSE)

healthy store data

We obtained the geoid for the census tracts based on the fip code of NYC, boroughs and specific census tract codes.

nyc_healthy_store = 
  read_csv("./Datasets/Recognized_Shop_Healthy_Stores.csv")|>
  janitor::clean_names()|>
  distinct(bin, .keep_all = TRUE)|>
  mutate(fipcode = case_when(
    borough == "Bronx" ~ "36005",
    borough == "Brooklyn" ~ "36047",
    borough == "New York" ~ "36061"
  ),
  ct_label = case_when(
    census_tract_2020 < 100 ~ paste0("00", census_tract_2020, "00"),
    census_tract_2020 < 1000 ~ paste0("0", census_tract_2020, "00"),
    census_tract_2020 < 1200 ~ paste0(census_tract_2020, "00"),
    census_tract_2020 < 10000 ~ paste0("00", census_tract_2020),
    census_tract_2020 < 100000 ~ paste0("0", census_tract_2020)
  ))|>
  select(store_name,borough,zip_code,latitude,longitude,fipcode, ct_label)|>
  filter(!is.na(ct_label))|>
  mutate(geoid = paste(fipcode, ct_label, sep = ""))|>
  select(-fipcode, -ct_label)
## Rows: 675 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Store Name, Street Address, Borough, Neighborhood Tabulation Area ...
## dbl (10): Zip 
## Code, Year Awarded, Program 
## Wave, Latitude, Longitude, Com...
## lgl  (1): Address
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

We obtain the count for the number of healthy food stores in the census tracts and matched that with the snap enrollment data for New York City, Bronx and Brooklyn Borough.

please note that healthy store count is only available in three boroughs so make sure you filter before the match

health_store_count = 
nyc_healthy_store|>
  group_by(geoid)|>
  summarise(count_healthy_stores=n())

redlining_snap|>
  filter(str_starts(geoid, "36005") | str_starts(geoid, "36047")| str_starts(geoid, "36061"))|>
  left_join(health_store_count,by="geoid")
## # A tibble: 1,321 × 26
##    geoid        cbsa metro_name         hrs2020 interval2020 fip   name  ph_snap
##    <chr>       <dbl> <chr>                <dbl>        <dbl> <chr> <chr>   <dbl>
##  1 36061008603 35620 New York-Newark-J…       1            1 36061 Cens…     0  
##  2 36061010200 35620 New York-Newark-J…       1            1 36061 Cens…     3.1
##  3 36061010601 35620 New York-Newark-J…       1            1 36061 Cens…     0  
##  4 36061011401 35620 New York-Newark-J…       1            1 36061 Cens…     1.2
##  5 36061011402 35620 New York-Newark-J…       1            1 36061 Cens…     0.8
##  6 36061012000 35620 New York-Newark-J…       1            1 36061 Cens…     0.8
##  7 36061012200 35620 New York-Newark-J…       1            1 36061 Cens…     0  
##  8 36061012800 35620 New York-Newark-J…       1            1 36061 Cens…     0  
##  9 36061013000 35620 New York-Newark-J…       1            1 36061 Cens…     1.1
## 10 36061014200 35620 New York-Newark-J…       1            1 36061 Cens…     0  
## # ℹ 1,311 more rows
## # ℹ 18 more variables: total_ct_households <dbl>, ph_with_children <dbl>,
## #   ph_no_children <dbl>, ph_below_poverty <dbl>, ph_above_poverty <dbl>,
## #   ph_disability <dbl>, ph_no_disability <dbl>, ph_white <dbl>,
## #   ph_black <dbl>, ph_aian <dbl>, ph_asian <dbl>, ph_hispanic <dbl>,
## #   ph_no_work <dbl>, ph_1_work <dbl>, ph_2_work <dbl>, ph_other_race <dbl>,
## #   ph_non_hispanic <dbl>, count_healthy_stores <int>

PLACE data

place_crude =
  read_csv("./Datasets/PLACES.csv")|>
  janitor::clean_names()|>
  rename(geoid = location_id)|>
  select(geoid, measure:data_value, measure_id)
## Rows: 73621 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): StateAbbr, StateDesc, CountyName, DataSource, Category, Measure, D...
## dbl (10): Year, CountyFIPS, LocationName, Data_Value, Low_Confidence_Limit, ...
## lgl  (2): Data_Value_Footnote_Symbol, Data_Value_Footnote
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
place_cleaned=
  place_crude|>
  filter(measure_id %in% c("CANCER","DIABETES", "HIGHCHOL", "OBESITY"))|>
  pivot_wider(
    id_cols = geoid,
    names_from = measure_id,
    values_from = data_value
  )|>
  mutate(geoid = as.character(geoid))|>
  janitor::clean_names()

code book for place measures

place_crude|>
  distinct(measure, data_value_unit, measure_id)|>
  filter(measure_id %in% c("CANCER","DIABETES", "HIGHCHOL", "OBESITY"))|>
  knitr::kable()
measure data_value_unit measure_id
Obesity among adults % OBESITY
Diagnosed diabetes among adults % DIABETES
High cholesterol among adults who have ever been screened % HIGHCHOL
Cancer (non-skin) or melanoma among adults % CANCER

Comments: I think we may or may not need the following data in analysis, but if someone want to use that, we will include them in the data cleaning, or I will just delete them

poverty level [optional for use ]

poverty = 
  read_csv(file = "./Datasets/ACS-poverty-level.csv", skip=1)|>
  janitor::clean_names()|>
  mutate(geoid = str_remove(geography, ".*US"))
## New names:
## Rows: 2327 Columns: 375
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (146): Geography, Geographic Area Name, Estimate!!Total!!UNRELATED INDIV... dbl
## (228): Estimate!!Total!!Population for whom poverty status is determined... lgl
## (1): ...375
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...375`

NYC census demographic [optional for use]

For this data we have more detailed demographic by age division, and slightly different way of measuring race and ethnicty (Hispanic White non-Hispanic,Black non-Hispanic, Asian non-Hispanic, Some other race non-Hispanic, Non-Hispanic of two or more races). In case this will be interesting for EDA or analysis

demo = 
  read_excel("./Datasets/NYC_census_core_data.xlsx")|>
  janitor::clean_names()|>
  filter(geo_type == "CT2020")|>
  rename(geoid = geo_id)